package org.apache.lucene.search.postingshighlight;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.PriorityQueue;
import java.util.SortedSet;
import java.util.TreeSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexReaderContext;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.StoredFieldVisitor;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.InPlaceMergeSorter;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;

/**
 * Simple highlighter that does not analyze fields nor use
 * term vectors. Instead it requires
 * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
 * <p>
 * PostingsHighlighter treats the single original document as the whole corpus, and then scores individual
 * passages as if they were documents in this corpus. It uses a {@link BreakIterator} to find
 * passages in the text; by default it breaks using {@link BreakIterator#getSentenceInstance(Locale)
 * getSentenceInstance(Locale.ROOT)}. It then iterates in parallel (merge sorting by offset) through
 * the positions of all terms from the query, coalescing those hits that occur in a single passage
 * into a {@link Passage}, and then scores each Passage using a separate {@link PassageScorer}.
 * Passages are finally formatted into highlighted snippets with a {@link PassageFormatter}.
 * <p>
 * You can customize the behavior by subclassing this highlighter; some important hooks:
 * <ul>
 *   <li>{@link #getBreakIterator(String)}: Customize how the text is divided into passages.
 *   <li>{@link #getScorer(String)}: Customize how passages are ranked.
 *   <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
 *   <li>{@link #getIndexAnalyzer(String)}: Enable highlighting of MultiTermQuerys such as {@code WildcardQuery}.
 * </ul>
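 * <p>
 * For example, a minimal sketch of a subclass that customizes the tags placed
 * around each hit and enables wildcard highlighting (the {@code indexAnalyzer}
 * field is an assumption here, standing for the analyzer used at index time):
 * <pre class="prettyprint">
 *   PostingsHighlighter highlighter = new PostingsHighlighter() {
 *     &#64;Override
 *     protected PassageFormatter getFormatter(String field) {
 *       return new DefaultPassageFormatter("&lt;em&gt;", "&lt;/em&gt;", "... ", false);
 *     }
 *     &#64;Override
 *     protected Analyzer getIndexAnalyzer(String field) {
 *       return indexAnalyzer;
 *     }
 *   };
 * </pre>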
 * <p>
 * <b>WARNING</b>: The code is very new and probably still has some exciting bugs!
 * <p>
 * Example usage:
 * <pre class="prettyprint">
 *   // configure field with offsets at index time
 *   FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
 *   offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
 *   Field body = new Field("body", "foobar", offsetsType);
 *
 *   // retrieve highlights at query time
 *   PostingsHighlighter highlighter = new PostingsHighlighter();
 *   Query query = new TermQuery(new Term("body", "highlighting"));
 *   TopDocs topDocs = searcher.search(query, n);
 *   String highlights[] = highlighter.highlight("body", query, searcher, topDocs);
 * </pre>
 * <p>
 * This is thread-safe, and can be used across different readers.
 * @lucene.experimental
 */
public class PostingsHighlighter {

  // TODO: maybe allow re-analysis for tiny fields? currently we require offsets,
  // but if the analyzer is really fast and the field is tiny, this might really be
  // unnecessary.

  /** for rewriting: we don't want slow processing from MTQs */
  private static final IndexSearcher EMPTY_INDEXSEARCHER;
  static {
    try {
      IndexReader emptyReader = new MultiReader();
      EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader);
      EMPTY_INDEXSEARCHER.setQueryCache(null);
    } catch (IOException bogus) {
      throw new RuntimeException(bogus);
    }
  }

  /** Default maximum content size to process. Typically snippets
   *  closer to the beginning of the document better summarize its content. */
  public static final int DEFAULT_MAX_LENGTH = 10000;

  private final int maxLength;

  /** Set the first time {@link #getFormatter} is called,
   *  and then reused. */
  private PassageFormatter defaultFormatter;

  /** Set the first time {@link #getScorer} is called,
   *  and then reused. */
  private PassageScorer defaultScorer;

  /**
   * Creates a new highlighter with {@link #DEFAULT_MAX_LENGTH}.
   */
  public PostingsHighlighter() {
    this(DEFAULT_MAX_LENGTH);
  }

  /**
   * Creates a new highlighter, specifying maximum content length.
   * @param maxLength maximum content size to process.
   * @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
   */
  public PostingsHighlighter(int maxLength) {
    if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
      // two reasons: we don't want overflow problems in BreakIterator.preceding(offset+1),
      // and our sentinel in the offsets queue uses Integer.MAX_VALUE to terminate.
      throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
    }
    this.maxLength = maxLength;
  }

  /** Returns the {@link BreakIterator} to use for
   *  dividing text into passages.  This returns
   *  {@link BreakIterator#getSentenceInstance(Locale)} by default;
   *  subclasses can override to customize. */
  protected BreakIterator getBreakIterator(String field) {
    return BreakIterator.getSentenceInstance(Locale.ROOT);
  }

  /** Returns the {@link PassageFormatter} to use for
   *  formatting passages into highlighted snippets.  This
   *  returns a {@link DefaultPassageFormatter} by default
   *  (created lazily and then reused); subclasses can override to customize. */
  protected PassageFormatter getFormatter(String field) {
    if (defaultFormatter == null) {
      defaultFormatter = new DefaultPassageFormatter();
    }
    return defaultFormatter;
  }

  /** Returns the {@link PassageScorer} to use for
   *  ranking passages.  This returns a default
   *  {@link PassageScorer} (created lazily and then reused);
   *  subclasses can override to customize. */
  protected PassageScorer getScorer(String field) {
    if (defaultScorer == null) {
      defaultScorer = new PassageScorer();
    }
    return defaultScorer;
  }

  /**
   * Highlights the top passages from a single field.
   *
   * @param field field name to highlight.
   *        Must have a stored string value and also be indexed with offsets.
   * @param query query to highlight.
   * @param searcher searcher that was previously used to execute the query.
   * @param topDocs TopDocs containing the summary result documents to highlight.
   * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
   *         If no highlights were found for a document, the
   *         first sentence for the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
    return highlight(field, query, searcher, topDocs, 1);
  }

  /**
   * Highlights the top-N passages from a single field.
   *
   * @param field field name to highlight.
   *        Must have a stored string value and also be indexed with offsets.
   * @param query query to highlight.
   * @param searcher searcher that was previously used to execute the query.
   * @param topDocs TopDocs containing the summary result documents to highlight.
   * @param maxPassages The maximum number of top-N ranked passages used to
   *        form the highlighted snippets.
   * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
   *         If no highlights were found for a document, the
   *         first {@code maxPassages} sentences from the
   *         field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
    Map<String,String[]> res = highlightFields(new String[] { field }, query, searcher, topDocs, new int[] { maxPassages });
    return res.get(field);
  }

  /**
   * Highlights the top passages from multiple fields.
   * <p>
   * Conceptually, this behaves as a more efficient form of:
   * <pre class="prettyprint">
   * Map m = new HashMap();
   * for (String field : fields) {
   *   m.put(field, highlight(field, query, searcher, topDocs));
   * }
   * return m;
   * </pre>
   *
   * @param fields field names to highlight.
   *        Must have a stored string value and also be indexed with offsets.
   * @param query query to highlight.
   * @param searcher searcher that was previously used to execute the query.
   * @param topDocs TopDocs containing the summary result documents to highlight.
   * @return Map keyed on field name, containing the array of formatted snippets
   *         corresponding to the documents in <code>topDocs</code>.
   *         If no highlights were found for a document, the
   *         first sentence from the field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
    int maxPassages[] = new int[fields.length];
    Arrays.fill(maxPassages, 1);
    return highlightFields(fields, query, searcher, topDocs, maxPassages);
  }

  /**
   * Highlights the top-N passages from multiple fields.
   * <p>
   * Conceptually, this behaves as a more efficient form of:
   * <pre class="prettyprint">
   * Map m = new HashMap();
   * for (String field : fields) {
   *   m.put(field, highlight(field, query, searcher, topDocs, maxPassages));
   * }
   * return m;
   * </pre>
   *
   * @param fields field names to highlight.
   *        Must have a stored string value and also be indexed with offsets.
   * @param query query to highlight.
   * @param searcher searcher that was previously used to execute the query.
   * @param topDocs TopDocs containing the summary result documents to highlight.
   * @param maxPassages The maximum number of top-N ranked passages per-field used to
   *        form the highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets
   *         corresponding to the documents in <code>topDocs</code>.
   *         If no highlights were found for a document, the
   *         first {@code maxPassages} sentences from the
   *         field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages[]) throws IOException {
    final ScoreDoc scoreDocs[] = topDocs.scoreDocs;
    int docids[] = new int[scoreDocs.length];
    for (int i = 0; i < docids.length; i++) {
      docids[i] = scoreDocs[i].doc;
    }

    return highlightFields(fields, query, searcher, docids, maxPassages);
  }

  /**
   * Highlights the top-N passages from multiple fields,
   * for the provided int[] docids.
   *
   * @param fieldsIn field names to highlight.
   *        Must have a stored string value and also be indexed with offsets.
   * @param query query to highlight.
   * @param searcher searcher that was previously used to execute the query.
   * @param docidsIn containing the document IDs to highlight.
   * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
   *        form the highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets
   *         corresponding to the documents in <code>docidsIn</code>.
   *         If no highlights were found for a document, the
   *         first {@code maxPassages} sentences from the
   *         field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  public Map<String,String[]> highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
    Map<String,String[]> snippets = new HashMap<>();
    for(Map.Entry<String,Object[]> ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) {
      Object[] snippetObjects = ent.getValue();
      String[] snippetStrings = new String[snippetObjects.length];
      snippets.put(ent.getKey(), snippetStrings);
      for(int i=0;i<snippetObjects.length;i++) {
        Object snippet = snippetObjects[i];
        if (snippet != null) {
          snippetStrings[i] = snippet.toString();
        }
      }
    }

    return snippets;
  }

  /**
   * Expert: highlights the top-N passages from multiple fields,
   * for the provided int[] docids, to custom Object as
   * returned by the {@link PassageFormatter}.  Use
   * this API to render to something other than String.
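   * <p>
   * For example, a sketch of the override a subclass might supply so that
   * this method returns a hypothetical {@code Snippet} object per document
   * ({@code SnippetFormatter} is assumed, not part of this package):
   * <pre class="prettyprint">
   *   &#64;Override
   *   protected PassageFormatter getFormatter(String field) {
   *     return new SnippetFormatter(); // its format() returns Snippet, not String
   *   }
   * </pre>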
   *
   * @param fieldsIn field names to highlight.
   *        Must have a stored string value and also be indexed with offsets.
   * @param query query to highlight.
   * @param searcher searcher that was previously used to execute the query.
   * @param docidsIn containing the document IDs to highlight.
   * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
   *        form the highlighted snippets.
   * @return Map keyed on field name, containing the array of formatted snippets
   *         corresponding to the documents in <code>docidsIn</code>.
   *         If no highlights were found for a document, the
   *         first {@code maxPassages} sentences from the
   *         field will be returned.
   * @throws IOException if an I/O error occurred during processing
   * @throws IllegalArgumentException if <code>field</code> was indexed without
   *         {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
   */
  protected Map<String,Object[]> highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
    if (fieldsIn.length < 1) {
      throw new IllegalArgumentException("fieldsIn must not be empty");
    }
    if (fieldsIn.length != maxPassagesIn.length) {
      throw new IllegalArgumentException("invalid number of maxPassagesIn");
    }
    SortedSet<Term> queryTerms = new TreeSet<>();
    EMPTY_INDEXSEARCHER.createNormalizedWeight(query, false).extractTerms(queryTerms);

    IndexReaderContext readerContext = searcher.getIndexReader().getContext();
    List<LeafReaderContext> leaves = readerContext.leaves();

    // Make our own copies because we sort in-place:
    int[] docids = new int[docidsIn.length];
    System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length);
    final String fields[] = new String[fieldsIn.length];
    System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
    final int maxPassages[] = new int[maxPassagesIn.length];
    System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);

    // sort for sequential io
    Arrays.sort(docids);
    new InPlaceMergeSorter() {

      @Override
      protected void swap(int i, int j) {
        String tmp = fields[i];
        fields[i] = fields[j];
        fields[j] = tmp;
        int tmp2 = maxPassages[i];
        maxPassages[i] = maxPassages[j];
        maxPassages[j] = tmp2;
      }

      @Override
      protected int compare(int i, int j) {
        return fields[i].compareTo(fields[j]);
      }

    }.sort(0, fields.length);

    // pull stored data:
    String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);

    Map<String,Object[]> highlights = new HashMap<>();
    for (int i = 0; i < fields.length; i++) {
      String field = fields[i];
      int numPassages = maxPassages[i];
      Term floor = new Term(field, "");
      Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
      SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
      // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)

      // Strip off the redundant field:
      BytesRef terms[] = new BytesRef[fieldTerms.size()];
      int termUpto = 0;
      for(Term term : fieldTerms) {
        terms[termUpto++] = term.bytes();
      }
      Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages, query);

      Object[] result = new Object[docids.length];
      for (int j = 0; j < docidsIn.length; j++) {
        result[j] = fieldHighlights.get(docidsIn[j]);
      }
      highlights.put(field, result);
    }
    return highlights;
  }

  /** Loads the String values for each field X docID to be
   *  highlighted.  By default this loads from stored
   *  fields, but a subclass can change the source.  This
   *  method should allocate the String[fields.length][docids.length]
   *  and fill all values.  The returned Strings must be
   *  identical to what was indexed.
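   *  <p>
   *  A minimal sketch of an override that pulls values from an external
   *  store instead (the {@code externalStore} lookup here is hypothetical):
   *  <pre class="prettyprint">
   *    &#64;Override
   *    protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
   *      String[][] contents = new String[fields.length][docids.length];
   *      for (int i = 0; i &lt; fields.length; i++) {
   *        for (int j = 0; j &lt; docids.length; j++) {
   *          String value = externalStore.get(fields[i], docids[j]);
   *          // truncate to maxLength, mirroring the default implementation
   *          contents[i][j] = value.length() &gt; maxLength ? value.substring(0, maxLength) : value;
   *        }
   *      }
   *      return contents;
   *    }
   *  </pre>
   */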
  protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
    String contents[][] = new String[fields.length][docids.length];
    char valueSeparators[] = new char[fields.length];
    for (int i = 0; i < fields.length; i++) {
      valueSeparators[i] = getMultiValuedSeparator(fields[i]);
    }
    LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength);
    for (int i = 0; i < docids.length; i++) {
      searcher.doc(docids[i], visitor);
      for (int j = 0; j < fields.length; j++) {
        contents[j][i] = visitor.getValue(j).toString();
      }
      visitor.reset();
    }
    return contents;
  }

  /**
   * Returns the logical separator between values for multi-valued fields.
   * The default value is a space character, which means passages can span across values,
   * but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)}
   * if each value holds a discrete passage for highlighting.
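   * <p>
   * A minimal sketch of such an override:
   * <pre class="prettyprint">
   *   &#64;Override
   *   protected char getMultiValuedSeparator(String field) {
   *     return '&#92;u2029'; // PARAGRAPH SEPARATOR: prevents passages from spanning values
   *   }
   * </pre>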
   */
  protected char getMultiValuedSeparator(String field) {
    return ' ';
  }

  /**
   * Returns the analyzer originally used to index the content for {@code field}.
   * <p>
   * This is used to highlight some MultiTermQueries.
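   * <p>
   * A minimal sketch of an override (assuming the subclass keeps a reference,
   * here called {@code indexAnalyzer}, to the analyzer used at index time):
   * <pre class="prettyprint">
   *   &#64;Override
   *   protected Analyzer getIndexAnalyzer(String field) {
   *     return indexAnalyzer; // enables e.g. WildcardQuery and PrefixQuery highlighting
   *   }
   * </pre>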
   * @return Analyzer or null (the default, meaning no special multi-term processing)
   */
  protected Analyzer getIndexAnalyzer(String field) {
    return null;
  }

  private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<LeafReaderContext> leaves, int maxPassages, Query query) throws IOException {
    Map<Integer,Object> highlights = new HashMap<>();

    PassageFormatter fieldFormatter = getFormatter(field);
    if (fieldFormatter == null) {
      throw new NullPointerException("PassageFormatter cannot be null");
    }

    // check if we should do any multiterm processing
    Analyzer analyzer = getIndexAnalyzer(field);
    CharacterRunAutomaton automata[] = new CharacterRunAutomaton[0];
    if (analyzer != null) {
      automata = MultiTermHighlighting.extractAutomata(query, field);
    }

    // resize 'terms', where the last term is the multiterm matcher
    if (automata.length > 0) {
      BytesRef newTerms[] = new BytesRef[terms.length + 1];
      System.arraycopy(terms, 0, newTerms, 0, terms.length);
      terms = newTerms;
    }

    // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes
    // otherwise, we will just advance() existing enums to the new document in the same segment.
    PostingsEnum postings[] = null;
    TermsEnum termsEnum = null;
    int lastLeaf = -1;

    for (int i = 0; i < docids.length; i++) {
      String content = contents[i];
      if (content.length() == 0) {
        continue; // nothing to do
      }
      bi.setText(content);
      int doc = docids[i];
      int leaf = ReaderUtil.subIndex(doc, leaves);
      LeafReaderContext subContext = leaves.get(leaf);
      LeafReader r = subContext.reader();

      assert leaf >= lastLeaf; // increasing order

      // if the segment has changed, we must initialize new enums.
      if (leaf != lastLeaf) {
        Terms t = r.terms(field);
        if (t != null) {
          if (!t.hasOffsets()) {
            // no offsets available
            throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
          }
          termsEnum = t.iterator();
          postings = new PostingsEnum[terms.length];
        } else {
          termsEnum = null;
        }
      }
      if (termsEnum == null) {
        continue; // no terms for this field, nothing to do
      }

      // if there are multi-term matches, we have to initialize the "fake" enum for each document
      if (automata.length > 0) {
        PostingsEnum dp = MultiTermHighlighting.getDocsEnum(analyzer.tokenStream(field, content), automata);
        dp.advance(doc - subContext.docBase);
        postings[terms.length-1] = dp; // last term is the multiterm matcher
      }

      Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);

      if (passages.length == 0) {
        // no passages were returned, so ask for a default summary
        passages = getEmptyHighlight(field, bi, maxPassages);
      }

      if (passages.length > 0) {
        highlights.put(doc, fieldFormatter.format(passages, content));
      }

      lastLeaf = leaf;
    }

    return highlights;
  }

  // algorithm: treat sentence snippets as miniature documents
  // we can intersect these with the postings lists via BreakIterator.preceding(offset),
  // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
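  // for illustration (exact values depend on the PassageScorer in use): a passage
  // starting at offset 0 that contains termA twice and termB once would score
  //   norm(0) * (weight(termA) * tf(2, passageLength) + weight(termB) * tf(1, passageLength)),
  // where norm() favors passages closer to the start of the content.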
  private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc,
      TermsEnum termsEnum, PostingsEnum[] postings, int n) throws IOException {
    PassageScorer scorer = getScorer(field);
    if (scorer == null) {
      throw new NullPointerException("PassageScorer cannot be null");
    }
    PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>();
    float weights[] = new float[terms.length];
    // initialize postings
    for (int i = 0; i < terms.length; i++) {
      PostingsEnum de = postings[i];
      int pDoc;
      if (de == EMPTY) {
        continue;
      } else if (de == null) {
        postings[i] = EMPTY; // initially
        if (!termsEnum.seekExact(terms[i])) {
          continue; // term not found
        }
        de = postings[i] = termsEnum.postings(null, PostingsEnum.OFFSETS);
        assert de != null;
        pDoc = de.advance(doc);
      } else {
        pDoc = de.docID();
        if (pDoc < doc) {
          pDoc = de.advance(doc);
        }
      }

      if (doc == pDoc) {
        weights[i] = scorer.weight(contentLength, de.freq());
        de.nextPosition();
        pq.add(new OffsetsEnum(de, i));
      }
    }

    pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination

    PriorityQueue<Passage> passageQueue = new PriorityQueue<>(n, new Comparator<Passage>() {
      @Override
      public int compare(Passage left, Passage right) {
        if (left.score < right.score) {
          return -1;
        } else if (left.score > right.score) {
          return 1;
        } else {
          return left.startOffset - right.startOffset;
        }
      }
    });
    Passage current = new Passage();

    OffsetsEnum off;
    while ((off = pq.poll()) != null) {
      final PostingsEnum dp = off.dp;
      int start = dp.startOffset();
      assert start >= 0;
      int end = dp.endOffset();
      // LUCENE-5166: this hit would span the content limit... however more valid
      // hits may exist (they are sorted by start). so we pretend like we never
      // saw this term, it won't cause a passage to be added to passageQueue or anything.
      assert EMPTY.startOffset() == Integer.MAX_VALUE;
      if (start < contentLength && end > contentLength) {
        continue;
      }
      if (start >= current.endOffset) {
        if (current.startOffset >= 0) {
          // finalize current
          current.score *= scorer.norm(current.startOffset);
          // new sentence: first add 'current' to queue
          if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
            current.reset(); // can't compete, just reset it
          } else {
            passageQueue.offer(current);
            if (passageQueue.size() > n) {
              current = passageQueue.poll();
              current.reset();
            } else {
              current = new Passage();
            }
          }
        }
        // if we exceed limit, we are done
        if (start >= contentLength) {
          Passage passages[] = new Passage[passageQueue.size()];
          passageQueue.toArray(passages);
          for (Passage p : passages) {
            p.sort();
          }
          // sort in ascending order
          Arrays.sort(passages, new Comparator<Passage>() {
            @Override
            public int compare(Passage left, Passage right) {
              return left.startOffset - right.startOffset;
            }
          });
          return passages;
        }
        // advance breakiterator
        assert BreakIterator.DONE < 0;
        current.startOffset = Math.max(bi.preceding(start+1), 0);
        current.endOffset = Math.min(bi.next(), contentLength);
      }
      int tf = 0;
      while (true) {
        tf++;
        BytesRef term = terms[off.id];
        if (term == null) {
          // multitermquery match, pull from payload
          term = off.dp.getPayload();
          assert term != null;
        }
        current.addMatch(start, end, term);
        if (off.pos == dp.freq()) {
          break; // removed from pq
        } else {
          off.pos++;
          dp.nextPosition();
          start = dp.startOffset();
          end = dp.endOffset();
        }
        if (start >= current.endOffset || end > contentLength) {
          pq.offer(off);
          break;
        }
      }
      current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
    }

    // Dead code but compiler disagrees:
    assert false;
    return null;
  }

  /** Called to summarize a document when no hits were
   *  found.  By default this just returns the first
   *  {@code maxPassages} sentences; subclasses can override
   *  to customize. */
  protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
    // BreakIterator should be un-next'd:
    List<Passage> passages = new ArrayList<>();
    int pos = bi.current();
    assert pos == 0;
    while (passages.size() < maxPassages) {
      int next = bi.next();
      if (next == BreakIterator.DONE) {
        break;
      }
      Passage passage = new Passage();
      passage.score = Float.NaN;
      passage.startOffset = pos;
      passage.endOffset = next;
      passages.add(passage);
      pos = next;
    }

    return passages.toArray(new Passage[passages.size()]);
  }

  private static class OffsetsEnum implements Comparable<OffsetsEnum> {
    PostingsEnum dp;
    int pos;
    int id;

    OffsetsEnum(PostingsEnum dp, int id) throws IOException {
      this.dp = dp;
      this.id = id;
      this.pos = 1;
    }

    @Override
    public int compareTo(OffsetsEnum other) {
      try {
        int off = dp.startOffset();
        int otherOff = other.dp.startOffset();
        if (off == otherOff) {
          return id - other.id;
        } else {
          return Integer.compare(off, otherOff);
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
    }
  }

  private static final PostingsEnum EMPTY = new PostingsEnum() {

    @Override
    public int nextPosition() throws IOException { return -1; }

    @Override
    public int startOffset() throws IOException { return Integer.MAX_VALUE; }

    @Override
    public int endOffset() throws IOException { return Integer.MAX_VALUE; }

    @Override
    public BytesRef getPayload() throws IOException { return null; }

    @Override
    public int freq() throws IOException { return 0; }

    @Override
    public int docID() { return NO_MORE_DOCS; }

    @Override
    public int nextDoc() throws IOException { return NO_MORE_DOCS; }

    @Override
    public int advance(int target) throws IOException { return NO_MORE_DOCS; }

    @Override
    public long cost() { return 0; }
  };

  private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
    private final String fields[];
    private final char valueSeparators[];
    private final int maxLength;
    private final StringBuilder builders[];
    private int currentField = -1;

    public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) {
      assert fields.length == valueSeparators.length;
      this.fields = fields;
      this.valueSeparators = valueSeparators;
      this.maxLength = maxLength;
      builders = new StringBuilder[fields.length];
      for (int i = 0; i < builders.length; i++) {
        builders[i] = new StringBuilder();
      }
    }

    @Override
    public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
      String value = new String(bytes, StandardCharsets.UTF_8);
      assert currentField >= 0;
      StringBuilder builder = builders[currentField];
      if (builder.length() > 0 && builder.length() < maxLength) {
        builder.append(valueSeparators[currentField]);
      }
      if (builder.length() + value.length() > maxLength) {
        builder.append(value, 0, maxLength - builder.length());
      } else {
        builder.append(value);
      }
    }

    @Override
    public Status needsField(FieldInfo fieldInfo) throws IOException {
      currentField = Arrays.binarySearch(fields, fieldInfo.name);
      if (currentField < 0) {
        return Status.NO;
      } else if (builders[currentField].length() > maxLength) {
        return fields.length == 1 ? Status.STOP : Status.NO;
      }
      return Status.YES;
    }

    String getValue(int i) {
      return builders[i].toString();
    }

    void reset() {
      currentField = -1;
      for (int i = 0; i < fields.length; i++) {
        builders[i].setLength(0);
      }
    }
  }
}